# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
#            ChangeSet 1.930.107.6 -> 1.930.107.7
#   include/linux/sysctl.h 1.23 -> 1.24
#   fs/proc/array.c 1.10 -> 1.11
#   include/linux/mm.h 1.44 -> 1.45
#   mm/mmap.c 1.29.1.2 -> 1.29.1.3
#   kernel/sysctl.c 1.19 -> 1.20
#   fs/Config.in 1.21 -> 1.22
#   include/asm-ia64/mmu_context.h 1.4 -> 1.4.1.1
#   include/asm-ia64/page.h 1.6 -> 1.7
#   fs/proc/proc_misc.c 1.20 -> 1.21
#   arch/ia64/kernel/ivt.S 1.6 -> 1.7
#   arch/ia64/mm/Makefile 1.1.1.1 -> 1.1.1.2
#   mm/mprotect.c 1.4 -> 1.5
#   mm/memory.c 1.54.1.2 -> 1.54.1.3
#   ipc/shm.c 1.11 -> 1.12
#   arch/ia64/config.in 1.13.2.1 -> 1.13.2.2
#   mm/mremap.c 1.5 -> 1.6
#   arch/ia64/kernel/sys_ia64.c 1.8 -> 1.8.1.1
#   fs/Makefile 1.16 -> 1.17
#   include/linux/shm.h 1.1 -> 1.2
#   fs/inode.c 1.36 -> 1.37
#   (new) -> 1.1 fs/hugetlbfs/Makefile
#   (new) -> 1.1 include/linux/hugetlb.h
#   (new) -> 1.1 arch/ia64/mm/hugetlbpage.c
#   (new) -> 1.1 fs/hugetlbfs/inode.c
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 03/05/15 eranian@hpl.hp.com 1.930.112.27
# ia64: perfmon TLB_* and ALAT event fix
#
# Please apply this on top of what I sent last week. This fixes a problem
# reported by UNSW on McKinley with the DATA_EAR_TLB_* or DATA_EAR_ALAT
# event. There was also a bug in the libpfm library, but the generalized
# solution requires a small kernel fix. Basically, we must let the users
# reprogram the PMC to their default value even though this violates some
# of the session requirements. This is safe because default PMC values
# ensure that nothing gets measured.
# --------------------------------------------
# 03/05/15 bjorn_helgaas@hp.com 1.930.1.195
# Merge hp.com:/home/helgaas/bk/ia64-extras
# into hp.com:/home/helgaas/bk/linux-ia64-2.4
# --------------------------------------------
# 03/05/15 bjorn_helgaas@hp.com 1.930.1.196
# Merge hp.com:/home/helgaas/bk/to-marcelo-2.4
# into hp.com:/home/helgaas/bk/linux-ia64-2.4
# --------------------------------------------
# 03/05/15 yoshfuji@linux-ipv6.org 1.930.130.33
# [IPV6]: ARCnet support, driver side.
# --------------------------------------------
# 03/05/15 yoshfuji@linux-ipv6.org 1.930.130.34
# [IPV6]: ARCnet support, protocol side.
# --------------------------------------------
# 03/05/16 davidm@tiger.hpl.hp.com 1.930.112.28
# ia64: Fix INIT copying of banked registers.
# --------------------------------------------
# 03/05/16 davidm@tiger.hpl.hp.com 1.930.112.29
# ia64: ptrace: don't let reading NaT bits for R4-R7 overwrite the value
# we're intending to write; get_rnat & put_rnat cleanups.
# --------------------------------------------
# 03/05/16 davidm@tiger.hpl.hp.com 1.930.112.30
# ia64: Fix ptrace() RNaT accessors.
# --------------------------------------------
# 03/05/16 bjorn_helgaas@hp.com 1.930.112.31
# ia64: ptrace whitespace changes to follow 2.5.
# --------------------------------------------
# 03/05/16 agrover@groveronline.com 1.980
# ACPI: acpi=off also implies drivers should not load (Zdenek Ogar Skalak)
# --------------------------------------------
# 03/05/16 davidm@tiger.hpl.hp.com 1.930.112.32
# ia64: Fix page-fault handler so it handles not-present translations for region 5
# (patch by John Marvin).
# --------------------------------------------
# 03/05/16 bjorn_helgaas@hp.com 1.930.1.197
# Merge hp.com:/home/helgaas/bk/to-marcelo-2.4
# into hp.com:/home/helgaas/bk/linux-ia64-2.4
# --------------------------------------------
# 03/05/17 dwmw2@dwmw2.baythorne.internal 1.930.143.1
# Switch to shared optimised CRC32 functions.
# --------------------------------------------
# 03/05/19 dwmw2@dwmw2.baythorne.internal 1.930.143.2
# Add config help for CONFIG_CRC32 (Duncan Sands )
# --------------------------------------------
# 03/05/19 trini@kernel.crashing.org 1.930.141.3
# PPC32: Allow for the RTC IRQ to be board-defined.
# From David Mueller .
# --------------------------------------------
# 03/05/19 agrover@groveronline.com 1.981
# ACPI: Update Toshiba driver to 0.15 (John Belmonte)
#  - workaround sporadic problem with hotkey ceasing to work
#  - cleanups
# --------------------------------------------
# 03/05/19 maxk@qualcomm.com 1.930.138.6
# [Bluetooth] Add support for SO_LINGER option to all Bluetooth protocols.
# This is required to pass Bluetooth qualification tests.
# Also fix error handling in L2CAP and RFCOMM sockets.
# --------------------------------------------
# 03/05/19 rohit.seth@intel.com 1.930.107.7
# Hugetlb support for ia64.
#
# Please find attached a hugetlb page patch for the 2.4.20 kernel. This
# patch is very close to the current hugepage support in the 2.5 base (and
# RH and SuSE) kernels. Only limited testing has been done on this patch
# so far.
# --------------------------------------------
#
diff -Nru a/arch/ia64/config.in b/arch/ia64/config.in
--- a/arch/ia64/config.in	Wed Oct  8 09:09:54 2003
+++ b/arch/ia64/config.in	Wed Oct  8 09:09:54 2003
@@ -86,6 +86,33 @@
 
 define_bool CONFIG_KCORE_ELF y	# On IA-64, we always want an ELF /proc/kcore.
 
+define_int CONFIG_FORCE_MAX_ZONEORDER 19
+
+bool 'IA-64 Huge TLB Page Support' CONFIG_HUGETLB_PAGE
+
+if [ "$CONFIG_HUGETLB_PAGE" = "y" ]; then
+  if [ "$CONFIG_MCKINLEY" = "y" ]; then
+    choice '  IA-64 Huge TLB Page Size' \
+      "4GB    CONFIG_HUGETLB_PAGE_SIZE_4GB \
+       1GB    CONFIG_HUGETLB_PAGE_SIZE_1GB \
+       256MB  CONFIG_HUGETLB_PAGE_SIZE_256MB \
+       64MB   CONFIG_HUGETLB_PAGE_SIZE_64MB \
+       16MB   CONFIG_HUGETLB_PAGE_SIZE_16MB \
+       4MB    CONFIG_HUGETLB_PAGE_SIZE_4MB \
+       1MB    CONFIG_HUGETLB_PAGE_SIZE_1MB \
+       256KB  CONFIG_HUGETLB_PAGE_SIZE_256KB" 16MB
+  else
+    choice '  IA-64 Huge TLB Page Size' \
+      "256MB  CONFIG_HUGETLB_PAGE_SIZE_256MB \
+       64MB   CONFIG_HUGETLB_PAGE_SIZE_64MB \
+       16MB   CONFIG_HUGETLB_PAGE_SIZE_16MB \
+       4MB    CONFIG_HUGETLB_PAGE_SIZE_4MB \
+       1MB    CONFIG_HUGETLB_PAGE_SIZE_1MB \
+       256KB  CONFIG_HUGETLB_PAGE_SIZE_256KB" 16MB
+  fi
+fi
+
+
 bool 'SMP support' CONFIG_SMP
 tristate 'Support running of Linux/x86 binaries' CONFIG_IA32_SUPPORT
 bool 'Performance monitor support' CONFIG_PERFMON
diff -Nru a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
--- a/arch/ia64/kernel/ivt.S	Wed Oct  8 09:09:54 2003
+++ b/arch/ia64/kernel/ivt.S	Wed Oct  8 09:09:54 2003
@@ -114,6 +114,10 @@
  *   - the faulting virtual address has no L1, L2, or L3 mapping
  */
 	mov r16=cr.ifa			// get address that caused the TLB miss
+#ifdef CONFIG_HUGETLB_PAGE
+	movl r18=PAGE_SHIFT
+	mov r25=cr.itir
+#endif
 	;;
 	rsm psr.dt			// use physical addressing for data
 	mov r31=pr			// save the predicate registers
@@ -121,8 +125,18 @@
 	shl r21=r16,3			// shift bit 60 into sign bit
 	shr.u r17=r16,61		// get the region number into r17
 	;;
+	shr r22=r21,3
+#ifdef CONFIG_HUGETLB_PAGE
+	extr.u r26=r25,2,6
+	;;
+	cmp.eq p8,p0=HPAGE_SHIFT,r26
+	;;
+(p8)	dep r25=r18,r25,2,6
+(p8)	shr r22=r22,HPAGE_SHIFT-PAGE_SHIFT	// changed r16 to r22 below
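+	// If p8 is set, the miss was in the hugepage region: r25 now holds
+	// an itir with the base page size and r22 the rescaled address, so
+	// the normal three-level walk below also covers hugepage PTEs.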
+#endif
+	;;
 	cmp.eq p6,p7=5,r17		// is IFA pointing into region 5?
-	shr.u r18=r16,PGDIR_SHIFT	// get bits 33-63 of the faulting address
+	shr.u r18=r22,PGDIR_SHIFT	// get bits 33-63 of the faulting address
 	;;
 (p7)	dep r17=r17,r19,(PAGE_SHIFT-3),3	// put region number bits in place
 	srlz.d				// ensure "rsm psr.dt" has taken effect
@@ -133,7 +147,7 @@
 (p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=PTA + IFA(33,42)*8
 (p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
 	cmp.eq p7,p6=0,r21		// unused address bits all zeroes?
-	shr.u r18=r16,PMD_SHIFT		// shift L2 index into position
+	shr.u r18=r22,PMD_SHIFT		// shift L2 index into position
 	;;
 	ld8 r17=[r17]			// fetch the L1 entry (may be 0)
 	;;
@@ -141,7 +155,7 @@
 	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// compute address of L2 page table entry
 	;;
 (p7)	ld8 r20=[r17]			// fetch the L2 entry (may be 0)
-	shr.u r19=r16,PAGE_SHIFT	// shift L3 index into position
+	shr.u r19=r22,PAGE_SHIFT	// shift L3 index into position
 	;;
 (p7)	cmp.eq.or.andcm p6,p7=r20,r0	// was L2 entry NULL?
 	dep r21=r19,r20,3,(PAGE_SHIFT-3)	// compute address of L3 page table entry
@@ -159,6 +173,10 @@
 (p11)	itc.d r18			// insert the data TLB entry
 (p6)	br.cond.spnt.many page_fault	// handle bad address/page not present (page fault)
 	mov cr.ifa=r22
+
+#ifdef CONFIG_HUGETLB_PAGE
+(p8)	mov cr.itir=r25			// change the page size back to PAGE_SIZE for the VHPT
+#endif
 
 	/*
 	 * Now compute and insert the TLB entry for the virtual page table.  We never
diff -Nru a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
--- a/arch/ia64/kernel/sys_ia64.c	Wed Oct  8 09:09:54 2003
+++ b/arch/ia64/kernel/sys_ia64.c	Wed Oct  8 09:09:54 2003
@@ -19,6 +19,8 @@
 #include
 #include
 
+#include
+
 unsigned long
 arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len,
 			unsigned long pgoff, unsigned long flags)
@@ -29,6 +31,10 @@
 	if (len > RGN_MAP_LIMIT)
 		return -ENOMEM;
+#ifdef CONFIG_HUGETLB_PAGE
+	if (rgn_index(addr) == REGION_HPAGE)
+		addr = 0;
+#endif
 	if (!addr)
 		addr = TASK_UNMAPPED_BASE;
diff -Nru a/arch/ia64/mm/Makefile b/arch/ia64/mm/Makefile
--- a/arch/ia64/mm/Makefile	Wed Oct  8 09:09:54 2003
+++ b/arch/ia64/mm/Makefile	Wed Oct  8 09:09:54 2003
@@ -12,5 +12,6 @@
 export-objs := init.o
 
 obj-y := init.o fault.o tlb.o extable.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 
 include $(TOPDIR)/Rules.make
diff -Nru a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/arch/ia64/mm/hugetlbpage.c	Wed Oct  8 09:09:54 2003
@@ -0,0 +1,468 @@
+/*
+ * IA-64 Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2002, Rohit Seth
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#define TASK_HPAGE_BASE (REGION_HPAGE << REGION_SHIFT)
+
+static long htlbpagemem;
+int htlbpage_max;
+static long htlbzone_pages;
+
+struct vm_operations_struct hugetlb_vm_ops;
+static LIST_HEAD(htlbpage_freelist);
+static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
+
+static struct page *alloc_hugetlb_page(void)
+{
+	int i;
+	struct page *page;
+
+	spin_lock(&htlbpage_lock);
+	if (list_empty(&htlbpage_freelist)) {
+		spin_unlock(&htlbpage_lock);
+		return NULL;
+	}
+
+	page = list_entry(htlbpage_freelist.next, struct page, list);
+	list_del(&page->list);
+	htlbpagemem--;
+	spin_unlock(&htlbpage_lock);
+	set_page_count(page, 1);
+	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
+		clear_highpage(&page[i]);
+	return page;
+}
+
+pte_t *
+huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+{
+	unsigned long taddr = htlbpage_to_page(addr);
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte = NULL;
+
+	pgd = pgd_offset(mm, taddr);
+	pmd = pmd_alloc(mm, pgd, taddr);
+	if (pmd)
+		pte = pte_alloc(mm, pmd, taddr);
+	return pte;
+}
+
+pte_t *
+huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	unsigned long taddr = htlbpage_to_page(addr);
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte = NULL;
+
+	pgd = pgd_offset(mm, taddr);
+	pmd = pmd_offset(pgd, taddr);
+	pte = pte_offset(pmd, taddr);
+	return pte;
+}
+
+#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
+
+void
+set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+	     struct page *page, pte_t *page_table, int write_access)
+{
+	pte_t entry;
+
+	mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+	if (write_access) {
+		entry =
+		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+	} else
+		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+	entry = pte_mkyoung(entry);
+	mk_pte_huge(entry);
+	set_pte(page_table, entry);
+	return;
+}
+
+/*
+ * This function checks for proper alignment of input addr and len parameters.
+ */
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
+{
+	if (len & ~HPAGE_MASK)
+		return -EINVAL;
+	if (addr & ~HPAGE_MASK)
+		return -EINVAL;
+	if (REGION_NUMBER(addr) != REGION_HPAGE)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * This function checks whether addr and addr+len fall outside of the HugeTLB
+ * region.  It returns -EINVAL if any part of the address range falls within
+ * the HugeTLB region.
+ */
+int is_invalid_hugepage_range(unsigned long addr, unsigned long len)
+{
+	if (REGION_NUMBER(addr) == REGION_HPAGE)
+		return -EINVAL;
+	if (REGION_NUMBER(addr+len) == REGION_HPAGE)
+		return -EINVAL;
+	return 0;
+}
+
+int
+copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+			struct vm_area_struct *vma)
+{
+	pte_t *src_pte, *dst_pte, entry;
+	struct page *ptepage;
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+
+	while (addr < end) {
+		dst_pte = huge_pte_alloc(dst, addr);
+		if (!dst_pte)
+			goto nomem;
+		src_pte = huge_pte_offset(src, addr);
+		entry = *src_pte;
+		ptepage = pte_page(entry);
+		get_page(ptepage);
+		set_pte(dst_pte, entry);
+		dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+		addr += HPAGE_SIZE;
+	}
+	return 0;
+
+nomem:
+	return -ENOMEM;
+}
+
+int
+follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		    struct page **pages, struct vm_area_struct **vmas,
+		    unsigned long *st, int *length, int i)
+{
+	pte_t *ptep, pte;
+	unsigned long start = *st;
+	unsigned long pstart;
+	int len = *length;
+	struct page *page;
+
+	do {
+		pstart = start;
+		ptep = huge_pte_offset(mm, start);
+		pte = *ptep;
+
+back1:
+		page = pte_page(pte);
+		if (pages) {
+			page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
+			pages[i] = page;
+		}
+		if (vmas)
+			vmas[i] = vma;
+		i++;
+		len--;
+		start += PAGE_SIZE;
+		if (((start & HPAGE_MASK) == pstart) && len &&
+		    (start < vma->vm_end))
+			goto back1;
+	} while (len && start < vma->vm_end);
+	*length = len;
+	*st = start;
+	return i;
+}
+
+void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+	BUG_ON(page->mapping);
+
+	INIT_LIST_HEAD(&page->list);
+
+	spin_lock(&htlbpage_lock);
+	list_add(&page->list, &htlbpage_freelist);
+	htlbpagemem++;
+	spin_unlock(&htlbpage_lock);
+}
+
+void huge_page_release(struct page *page)
+{
+	if (!put_page_testzero(page))
+		return;
+
+	free_huge_page(page);
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte;
+	struct page *page;
+
+	BUG_ON(start & (HPAGE_SIZE - 1));
+	BUG_ON(end & (HPAGE_SIZE - 1));
+
+	for (address = start; address < end; address += HPAGE_SIZE) {
+		pte = huge_pte_offset(mm, address);
+		if (pte_none(*pte))
+			continue;
+		page = pte_page(*pte);
+		huge_page_release(page);
+		pte_clear(pte);
+	}
+	mm->rss -= (end - start) >> PAGE_SHIFT;
+	flush_tlb_range(mm, start, end);
+}
+
+void zap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long length)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	spin_lock(&mm->page_table_lock);
+	unmap_hugepage_range(vma, start, start + length);
+	spin_unlock(&mm->page_table_lock);
+}
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = current->mm;
+	struct inode *inode = mapping->host;
+	unsigned long addr;
+	int ret = 0;
+
+	BUG_ON(vma->vm_start & ~HPAGE_MASK);
+	BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+	spin_lock(&mm->page_table_lock);
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+		unsigned long idx;
+		pte_t *pte = huge_pte_alloc(mm, addr);
+		struct page *page;
+
+		if (!pte) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		if (!pte_none(*pte))
+			continue;
+
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+		page = find_get_page(mapping, idx);
+		if (!page) {
+			loff_t i_size;
+
+			page = alloc_hugetlb_page();
+			if (!page) {
+				ret = -ENOMEM;
+				goto out;
+			}
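+			/* Freshly allocated huge page: hash it into the
+			 * page cache and grow i_size to cover it. */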
+			add_to_page_cache(page, mapping, idx);
+			unlock_page(page);
+			i_size = (loff_t)(idx + 1) * HPAGE_SIZE;
+			if (i_size > inode->i_size)
+				inode->i_size = i_size;
+		}
+		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+	}
+out:
+	spin_unlock(&mm->page_table_lock);
+	return ret;
+}
+
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags)
+{
+	struct vm_area_struct *vmm;
+
+	if (len > RGN_MAP_LIMIT)
+		return -ENOMEM;
+	if (len & ~HPAGE_MASK)
+		return -EINVAL;
+	/* This code assumes that REGION_HPAGE != 0. */
+	if ((REGION_NUMBER(addr) != REGION_HPAGE) || (addr & (HPAGE_SIZE - 1)))
+		addr = TASK_HPAGE_BASE;
+	else
+		addr = COLOR_HALIGN(addr);
+	for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
+		/* At this point:  (!vmm || addr < vmm->vm_end). */
+		if (REGION_OFFSET(addr) + len > RGN_MAP_LIMIT)
+			return -ENOMEM;
+		if (!vmm || (addr + len) <= vmm->vm_start)
+			return addr;
+		addr = COLOR_HALIGN(vmm->vm_end);
+	}
+}
+
+void update_and_free_page(struct page *page)
+{
+	int j;
+	struct page *map;
+
+	map = page;
+	htlbzone_pages--;
+	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
+		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved);
+		set_page_count(map, 0);
+		map++;
+	}
+	set_page_count(page, 1);
+	__free_pages(page, HUGETLB_PAGE_ORDER);
+}
+
+int try_to_free_low(int count)
+{
+	struct list_head *p;
+	struct page *page;
+	struct page *map;
+
+	map = NULL;
+	spin_lock(&htlbpage_lock);
+	list_for_each(p, &htlbpage_freelist) {
+		if (map) {
+			list_del(&map->list);
+			update_and_free_page(map);
+			htlbpagemem--;
+			map = NULL;
+			if (++count == 0)
+				break;
+		}
+		page = list_entry(p, struct page, list);
+		if ((page_zone(page))->name[0] != 'H')	// look for non-highmem zones
+			map = page;
+	}
+	if (map) {
+		list_del(&map->list);
+		update_and_free_page(map);
+		htlbpagemem--;
+		count++;
+	}
+	spin_unlock(&htlbpage_lock);
+	return count;
+}
+
+int set_hugetlb_mem_size(int count)
+{
+	int j, lcount;
+	struct page *page, *map;
+	extern long htlbzone_pages;
+	extern struct list_head htlbpage_freelist;
+
+	if (count < 0)
+		lcount = count;
+	else
+		lcount = count - htlbzone_pages;
+
+	if (lcount == 0)
+		return (int) htlbzone_pages;
+	if (lcount > 0) {	/* Increase the mem size. */
+		while (lcount--) {
+			page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
+			if (page == NULL)
+				break;
+			map = page;
+			for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
+				SetPageReserved(map);
+				map++;
+			}
+			spin_lock(&htlbpage_lock);
+			list_add(&page->list, &htlbpage_freelist);
+			htlbpagemem++;
+			htlbzone_pages++;
+			spin_unlock(&htlbpage_lock);
+		}
+		return (int) htlbzone_pages;
+	}
+	/* Shrink the memory size.
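+	 * First let try_to_free_low() release pages sitting in non-highmem
+	 * zones; whatever is still to be freed is then pulled off the
+	 * hugepage freelist and handed back to the buddy allocator.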
+	 */
+	lcount = try_to_free_low(lcount);
+	while (lcount++ < 0) {
+		page = alloc_hugetlb_page();
+		if (page == NULL)
+			break;
+		spin_lock(&htlbpage_lock);
+		update_and_free_page(page);
+		spin_unlock(&htlbpage_lock);
+	}
+	return (int) htlbzone_pages;
+}
+
+int hugetlb_sysctl_handler(ctl_table *table, int write, struct file *file, void *buffer, size_t *length)
+{
+	proc_dointvec(table, write, file, buffer, length);
+	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
+	return 0;
+}
+
+static int __init hugetlb_setup(char *s)
+{
+	if (sscanf(s, "%d", &htlbpage_max) <= 0)
+		htlbpage_max = 0;
+	return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+static int __init hugetlb_init(void)
+{
+	int i, j;
+	struct page *page;
+
+	for (i = 0; i < htlbpage_max; ++i) {
+		page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
+		if (!page)
+			break;
+		for (j = 0; j < HPAGE_SIZE/PAGE_SIZE; ++j)
+			SetPageReserved(&page[j]);
+		spin_lock(&htlbpage_lock);
+		list_add(&page->list, &htlbpage_freelist);
+		spin_unlock(&htlbpage_lock);
+	}
+	htlbpage_max = htlbpagemem = htlbzone_pages = i;
+	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
+	return 0;
+}
+module_init(hugetlb_init);
+
+int hugetlb_report_meminfo(char *buf)
+{
+	return sprintf(buf,
+			"HugePages_Total: %5lu\n"
+			"HugePages_Free:  %5lu\n"
+			"Hugepagesize:    %5lu kB\n",
+			htlbzone_pages,
+			htlbpagemem,
+			HPAGE_SIZE/1024);
+}
+
+int is_hugepage_mem_enough(size_t size)
+{
+	if (size > (htlbpagemem << HPAGE_SHIFT))
+		return 0;
+	return 1;
+}
+
+static struct page *hugetlb_nopage(struct vm_area_struct *area, unsigned long address, int unused)
+{
+	BUG();
+	return NULL;
+}
+
+struct vm_operations_struct hugetlb_vm_ops = {
+	.nopage = hugetlb_nopage,
+};
diff -Nru a/fs/Config.in b/fs/Config.in
--- a/fs/Config.in	Wed Oct  8 09:09:54 2003
+++ b/fs/Config.in	Wed Oct  8 09:09:54 2003
@@ -50,6 +50,10 @@
 bool 'Virtual memory file system support (former shm fs)' CONFIG_TMPFS
 define_bool CONFIG_RAMFS y
 
+if [ "$CONFIG_HUGETLB_PAGE" = "y" ]; then
+  tristate 'HugeTLB file system support' CONFIG_HUGETLBFS
+fi
+
 tristate 'ISO 9660 CDROM file system support' CONFIG_ISO9660_FS
 dep_mbool '  Microsoft Joliet CDROM extensions' CONFIG_JOLIET $CONFIG_ISO9660_FS
 dep_mbool '  Transparent decompression extension' CONFIG_ZISOFS $CONFIG_ISO9660_FS
diff -Nru a/fs/Makefile b/fs/Makefile
--- a/fs/Makefile	Wed Oct  8 09:09:54 2003
+++ b/fs/Makefile	Wed Oct  8 09:09:54 2003
@@ -31,6 +31,7 @@
 subdir-$(CONFIG_EXT2_FS)	+= ext2
 subdir-$(CONFIG_CRAMFS)		+= cramfs
 subdir-$(CONFIG_RAMFS)		+= ramfs
+subdir-$(CONFIG_HUGETLBFS)	+= hugetlbfs
 subdir-$(CONFIG_CODA_FS)	+= coda
 subdir-$(CONFIG_INTERMEZZO_FS)	+= intermezzo
 subdir-$(CONFIG_MINIX_FS)	+= minix
diff -Nru a/fs/hugetlbfs/Makefile b/fs/hugetlbfs/Makefile
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/fs/hugetlbfs/Makefile	Wed Oct  8 09:09:54 2003
@@ -0,0 +1,11 @@
+#
+# Makefile for the linux hugetlbfs routines.
+#
+
+O_TARGET := hugetlbfs.o
+
+obj-y := inode.o
+
+obj-m := $(O_TARGET)
+
+include $(TOPDIR)/Rules.make
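A note on sizing the pool above: htlbpage_max can be set at boot with the
"hugepages=N" option (see hugetlb_setup) or at run time through the
/proc/sys/vm/nr_hugepages sysctl wired up in kernel/sysctl.c below, e.g.
"echo 20 > /proc/sys/vm/nr_hugepages". The resulting HugePages_Total,
HugePages_Free and Hugepagesize figures appear in /proc/meminfo via
hugetlb_report_meminfo().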
diff -Nru a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/fs/hugetlbfs/inode.c	Wed Oct  8 09:09:54 2003
@@ -0,0 +1,607 @@
+/*
+ * hugetlbpage-backed filesystem.  Based on ramfs.
+ *
+ * William Irwin, 2002
+ *
+ * Copyright (C) 2002 Linus Torvalds.
+ * Backported from 2.5.48  11/19/2002 Rohit Seth
+ */
+
+#include
+#include
+#include	/* remove ASAP */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+extern struct list_head inode_unused;
+
+/* some random number */
+#define HUGETLBFS_MAGIC	0x958458f6
+
+static struct super_operations hugetlbfs_ops;
+static struct address_space_operations hugetlbfs_aops;
+struct file_operations hugetlbfs_file_operations;
+static struct inode_operations hugetlbfs_dir_inode_operations;
+static struct inode_operations hugetlbfs_inode_operations;
+
+static inline int hugetlbfs_positive(struct dentry *dentry)
+{
+	return dentry->d_inode && !d_unhashed(dentry);
+}
+
+static int hugetlbfs_empty(struct dentry *dentry)
+{
+	struct list_head *list;
+
+	spin_lock(&dcache_lock);
+	list = dentry->d_subdirs.next;
+
+	while (list != &dentry->d_subdirs) {
+		struct dentry *de = list_entry(list, struct dentry, d_child);
+
+		if (hugetlbfs_positive(de)) {
+			spin_unlock(&dcache_lock);
+			return 0;
+		}
+		list = list->next;
+	}
+	spin_unlock(&dcache_lock);
+	return 1;
+}
+
+int hugetlbfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+{
+	return 0;
+}
+
+static int hugetlbfs_statfs(struct super_block *sb, struct statfs *buf)
+{
+	buf->f_type = HUGETLBFS_MAGIC;
+	buf->f_bsize = PAGE_CACHE_SIZE;
+	buf->f_namelen = 255;
+	return 0;
+}
+
+static int hugetlbfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+{
+	int error = -ENOTEMPTY;
+
+	if (hugetlbfs_empty(new_dentry)) {
+		struct inode *inode = new_dentry->d_inode;
+		if (inode) {
+			inode->i_nlink--;
+			dput(new_dentry);
+		}
+		error = 0;
+	}
+	return error;
+}
+
+static int hugetlbfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	if (!hugetlbfs_empty(dentry))
+		return -ENOTEMPTY;
+	dentry->d_inode->i_nlink--;
+	dput(dentry);
+	return 0;
+}
+
+#define hugetlbfs_rmdir hugetlbfs_unlink
+
+static int hugetlbfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+
+	if (S_ISDIR(inode->i_mode))
+		return -EPERM;
+	inode->i_nlink++;
+	atomic_inc(&inode->i_count);
+	dget(dentry);
+	d_instantiate(dentry, inode);
+	return 0;
+}
+
+static struct dentry *hugetlbfs_lookup(struct inode *dir, struct dentry *dentry)
+{
+	d_add(dentry, NULL);
+	return NULL;
+}
+
+static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	loff_t len;
+	int ret;
+
+	if (vma->vm_start & ~HPAGE_MASK)
+		return -EINVAL;
+
+	if (vma->vm_end & ~HPAGE_MASK)
+		return -EINVAL;
+
+	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+		return -EINVAL;
+#ifdef CONFIG_IA64
+	if (vma->vm_start < (REGION_HPAGE << REGION_SHIFT))
+		return -EINVAL;
+#endif
+	down(&inode->i_sem);
+
+	UPDATE_ATIME(inode);
+	vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
+	vma->vm_ops = &hugetlb_vm_ops;
+	ret = hugetlb_prefault(mapping, vma);
+	up(&inode->i_sem);
+	return ret;
+}
+
+/*
+ * Called under down_write(mmap_sem), page_table_lock is not held
+ */
+
+#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags);
+#else
+static unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+
+	if (len & ~HPAGE_MASK)
+		return -EINVAL;
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (addr) {
+		addr = COLOR_HALIGN(addr);
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	addr = PAGE_ALIGN(TASK_UNMAPPED_BASE);
+
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (TASK_SIZE - len < addr)
+			return -ENOMEM;
+		if (!vma || addr + len <= vma->vm_start)
+			return addr;
+		addr = COLOR_HALIGN(vma->vm_end);
+	}
+}
+#endif
+
+/*
+ * Reading is not supported on hugetlbfs files; pages are only
+ * instantiated through mmap() (see hugetlbfs_file_mmap above).
+ */
+static int hugetlbfs_readpage(struct file *file, struct page *page)
+{
+	return -EINVAL;
+}
+
+static int hugetlbfs_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+{
+	return -EINVAL;
+}
+
+static int hugetlbfs_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+{
+	return -EINVAL;
+}
+
+void truncate_huge_page(struct address_space *mapping, struct page *page)
+{
+	if (page->mapping != mapping)
+		return;
+
+	ClearPageDirty(page);
+	ClearPageUptodate(page);
+	remove_inode_page(page);
+	set_page_count(page, 1);
+	huge_page_release(page);
+}
+
+void truncate_hugepages(struct inode *inode, struct address_space *mapping, loff_t lstart)
+{
+	unsigned long start = lstart >> HPAGE_SHIFT;
+	unsigned long next;
+	unsigned long max_idx;
+	struct page *page;
+
+	max_idx = inode->i_size >> HPAGE_SHIFT;
+	next = start;
+	while (next < max_idx) {
+		page = find_lock_page(mapping, next);
+		next++;
+		if (page == NULL)
+			continue;
+		page_cache_release(page);
+		truncate_huge_page(mapping, page);
+		unlock_page(page);
+	}
+}
+
+static void hugetlbfs_delete_inode(struct inode *inode)
+{
+	list_del_init(&inode->i_hash);
+	list_del_init(&inode->i_list);
+	inode->i_state |= I_FREEING;
+	inodes_stat.nr_inodes--;
+
+	if (inode->i_data.nrpages)
+		truncate_hugepages(inode, &inode->i_data, 0);
+}
+
+static void hugetlbfs_forget_inode(struct inode *inode)
+{
+	struct super_block *super_block = inode->i_sb;
+
+	if (list_empty(&inode->i_hash))
+		goto out_truncate;
+
+	if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+		list_del(&inode->i_list);
+		list_add(&inode->i_list, &inode_unused);
+	}
+	inodes_stat.nr_unused++;
+	if (!super_block || (super_block->s_flags & MS_ACTIVE)) {
+		return;
+	}
+
+	/* write_inode_now() ? */
+	inodes_stat.nr_unused--;
+	list_del_init(&inode->i_hash);
+out_truncate:
+	list_del_init(&inode->i_list);
+	inode->i_state |= I_FREEING;
+	inodes_stat.nr_inodes--;
+	if (inode->i_data.nrpages)
+		truncate_hugepages(inode, &inode->i_data, 0);
+}
+
+static void hugetlbfs_drop_inode(struct inode *inode)
+{
+	if (!inode->i_nlink)
+		hugetlbfs_delete_inode(inode);
+	else
+		hugetlbfs_forget_inode(inode);
+}
+
+static void hugetlb_vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff)
+{
+
+	do {
+		unsigned long h_vm_pgoff;
+		unsigned long v_length;
+		unsigned long h_length;
+		unsigned long v_offset;
+
+		h_vm_pgoff = mpnt->vm_pgoff << (HPAGE_SHIFT - PAGE_SHIFT);
+		v_length = mpnt->vm_end - mpnt->vm_start;
+		h_length = v_length >> HPAGE_SHIFT;
+		v_offset = (pgoff - h_vm_pgoff) << HPAGE_SHIFT;
+
+		/*
+		 * Is this VMA fully outside the truncation point?
+		 */
+		if (h_vm_pgoff >= pgoff) {
+			zap_hugepage_range(mpnt, mpnt->vm_start, v_length);
+			continue;
+		}
+
+		/*
+		 * Is this VMA fully inside the truncation point?
+		 */
+		if (h_vm_pgoff + (v_length >> HPAGE_SHIFT) <= pgoff)
+			continue;
+
+		/*
+		 * The VMA straddles the truncation point.  v_offset is the
+		 * offset (in bytes) into the VMA where the point lies.
+		 */
+		zap_hugepage_range(mpnt,
+				mpnt->vm_start + v_offset,
+				v_length - v_offset);
+	} while ((mpnt = mpnt->vm_next_share) != NULL);
+}
+
+
+/*
+ * Expanding truncates are not allowed.
+ */
+static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
+{
+	unsigned long pgoff;
+	struct address_space *mapping = inode->i_mapping;
+
+	if (offset > inode->i_size)
+		return -EINVAL;
+
+	BUG_ON(offset & ~HPAGE_MASK);
+	pgoff = offset >> HPAGE_SHIFT;
+
+	spin_lock(&mapping->i_shared_lock);
+	if (mapping->i_mmap != NULL)
+		hugetlb_vmtruncate_list(mapping->i_mmap, pgoff);
+	if (mapping->i_mmap_shared != NULL)
+		hugetlb_vmtruncate_list(mapping->i_mmap_shared, pgoff);
+
+	spin_unlock(&mapping->i_shared_lock);
+	truncate_hugepages(inode, mapping, offset);
+	inode->i_size = offset;
+	return 0;
+}
+
+static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+	unsigned int ia_valid = attr->ia_valid;
+	unsigned long dn_mask;
+
+	BUG_ON(!inode);
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		goto out;
+
+	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
+		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+	if (error)
+		goto out;
+
+	if (ia_valid & ATTR_SIZE) {
+		error = -EINVAL;
+		if (!(attr->ia_size & ~HPAGE_MASK))
+			error = hugetlb_vmtruncate(inode, attr->ia_size);
+		if (error)
+			goto out;
+		attr->ia_valid &= ~ATTR_SIZE;
+		error = inode_setattr(inode, attr);
+	}
+#if 0
+	if (error)
+		goto out;
+	dn_mask = setattr_mask(ia_valid);
+	if (dn_mask)
+		dnotify_parent(dentry, dn_mask);
+#endif
+out:
+	return error;
+}
+
+struct inode *hugetlbfs_get_inode(struct super_block *sb, int mode, int dev)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_rdev = NODEV;
+		inode->i_mapping->a_ops = &hugetlbfs_aops;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		switch (mode & S_IFMT) {
+		default:
+			init_special_inode(inode, mode, dev);
+			break;
+		case S_IFREG:
+			inode->i_op = &hugetlbfs_inode_operations;
+			inode->i_fop = &hugetlbfs_file_operations;
+			break;
+		case S_IFDIR:
+			inode->i_op = &hugetlbfs_dir_inode_operations;
+			inode->i_fop = &dcache_dir_ops;
+			break;
+		case S_IFLNK:
+			inode->i_op = &page_symlink_inode_operations;
+			break;
+		}
+	}
+	return inode;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done.
+ */
+/* SMP-safe */
+static int hugetlbfs_mknod(struct inode *dir, struct dentry *dentry, int mode, int dev)
+{
+	struct inode *inode = hugetlbfs_get_inode(dir->i_sb, mode, dev);
+	int error = -ENOSPC;
+
+	if (inode) {
+		d_instantiate(dentry, inode);
+		dget(dentry);	/* Extra count - pin the dentry in core */
+		error = 0;
+	}
+	return error;
+}
+
+static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+//	if (!retval)
+//		dir->i_nlink++;
+	return retval;
+}
+
+static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode)
+{
+	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+static int hugetlbfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	int error = -ENOSPC;
+
+	error = hugetlbfs_mknod(dir, dentry, S_IFLNK|S_IRWXUGO, 0);
+	if (!error) {
+		int l = strlen(symname)+1;
+		struct inode *inode = dentry->d_inode;
+		error = block_symlink(inode, symname, l);
+	}
+	return error;
+}
+
+static struct address_space_operations hugetlbfs_aops = {
+	readpage:	hugetlbfs_readpage,
+	writepage:	fail_writepage,
+	prepare_write:	hugetlbfs_prepare_write,
+	commit_write:	hugetlbfs_commit_write
+};
+
+struct file_operations hugetlbfs_file_operations = {
+	mmap:			hugetlbfs_file_mmap,
+	fsync:			hugetlbfs_sync_file,
+	get_unmapped_area:	hugetlb_get_unmapped_area,
+};
+
+static struct inode_operations hugetlbfs_dir_inode_operations = {
+	create:		hugetlbfs_create,
+	lookup:		hugetlbfs_lookup,
+	link:		hugetlbfs_link,
+	unlink:		hugetlbfs_unlink,
+	symlink:	hugetlbfs_symlink,
+	mkdir:		hugetlbfs_mkdir,
+	rmdir:		hugetlbfs_rmdir,
+	mknod:		hugetlbfs_mknod,
+	rename:		hugetlbfs_rename,
+	setattr:	hugetlbfs_setattr,
+};
+
+static struct inode_operations hugetlbfs_inode_operations = {
+	setattr:	hugetlbfs_setattr,
+};
+
+static struct super_operations hugetlbfs_ops = {
+	statfs:		hugetlbfs_statfs,
+	put_inode:	hugetlbfs_drop_inode,
+};
+
+static struct super_block *hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = HUGETLBFS_MAGIC;
+	sb->s_op = &hugetlbfs_ops;
+	inode = hugetlbfs_get_inode(sb, S_IFDIR | 0755, 0);
+	if (!inode)
+		return NULL;
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return NULL;
+	}
+	sb->s_root = root;
+	return sb;
+}
+
+static DECLARE_FSTYPE(hugetlbfs_fs_type, "hugetlbfs", hugetlbfs_fill_super, FS_LITTER);
+
+static struct vfsmount *hugetlbfs_vfsmount;
+
+static atomic_t hugetlbfs_counter = ATOMIC_INIT(0);
+
+struct file *hugetlb_zero_setup(size_t size)
+{
+	int error, n;
+	struct file *file;
+	struct inode *inode;
+	struct dentry *dentry, *root;
+	struct qstr quick_string;
+	char buf[16];
+
+	if (!is_hugepage_mem_enough(size))
+		return ERR_PTR(-ENOMEM);
+
+	n = atomic_read(&hugetlbfs_counter);
+	atomic_inc(&hugetlbfs_counter);
+
+	root = hugetlbfs_vfsmount->mnt_root;
+	snprintf(buf, 16, "%d", n);
+	quick_string.name = buf;
+	quick_string.len = strlen(quick_string.name);
+	quick_string.hash = 0;
+	dentry = d_alloc(root, &quick_string);
+	if (!dentry)
+		return ERR_PTR(-ENOMEM);
+
+	error = -ENFILE;
+	file = get_empty_filp();
+	if (!file)
+		goto out_dentry;
+
+	error = -ENOSPC;
+	inode = hugetlbfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
+	if (!inode)
+		goto out_file;
+
+	d_instantiate(dentry, inode);
+	inode->i_size = size;
+	inode->i_nlink = 0;
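+	/* The file has no directory entry (i_nlink is 0); like an unlinked
+	 * tmpfs file it lives only as long as references to it are held. */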
+	file->f_vfsmnt = mntget(hugetlbfs_vfsmount);
+	file->f_dentry = dentry;
+	file->f_op = &hugetlbfs_file_operations;
+	file->f_mode = FMODE_WRITE | FMODE_READ;
+	return file;
+
+out_file:
+	put_filp(file);
+out_dentry:
+	dput(dentry);
+	return ERR_PTR(error);
+}
+
+static int __init init_hugetlbfs_fs(void)
+{
+	int error;
+	struct vfsmount *vfsmount;
+
+	error = register_filesystem(&hugetlbfs_fs_type);
+	if (error)
+		return error;
+
+	vfsmount = kern_mount(&hugetlbfs_fs_type);
+
+	if (!IS_ERR(vfsmount)) {
+		printk("Hugetlbfs mounted.\n");
+		hugetlbfs_vfsmount = vfsmount;
+		return 0;
+	}
+
+	printk("Error in mounting hugetlbfs.\n");
+	error = PTR_ERR(vfsmount);
+	return error;
+}
+
+static void __exit exit_hugetlbfs_fs(void)
+{
+	unregister_filesystem(&hugetlbfs_fs_type);
+}
+
+module_init(init_hugetlbfs_fs)
+module_exit(exit_hugetlbfs_fs)
+
+MODULE_LICENSE("GPL");
diff -Nru a/fs/inode.c b/fs/inode.c
--- a/fs/inode.c	Wed Oct  8 09:09:54 2003
+++ b/fs/inode.c	Wed Oct  8 09:09:54 2003
@@ -56,7 +56,7 @@
  */
 
 static LIST_HEAD(inode_in_use);
-static LIST_HEAD(inode_unused);
+LIST_HEAD(inode_unused);
 static struct list_head *inode_hashtable;
 static LIST_HEAD(anon_hash_chain);	/* for inodes with NULL i_sb */
diff -Nru a/fs/proc/array.c b/fs/proc/array.c
--- a/fs/proc/array.c	Wed Oct  8 09:09:54 2003
+++ b/fs/proc/array.c	Wed Oct  8 09:09:54 2003
@@ -64,6 +64,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -489,6 +490,18 @@
 	pgd_t *pgd = pgd_offset(mm, vma->vm_start);
 	int pages = 0, shared = 0, dirty = 0, total = 0;
 
+	if (is_vm_hugetlb_page(vma)) {
+		int num_pages = ((vma->vm_end - vma->vm_start)/PAGE_SIZE);
+
+		resident += num_pages;
+		if (!(vma->vm_flags & VM_DONTCOPY))
+			share += num_pages;
+		if (vma->vm_flags & VM_WRITE)
+			dt += num_pages;
+		drs += num_pages;
+		vma = vma->vm_next;
+		continue;
+	}
 	statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
 	resident += pages;
 	share += shared;
diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
--- a/fs/proc/proc_misc.c	Wed Oct  8 09:09:54 2003
+++ b/fs/proc/proc_misc.c	Wed Oct  8 09:09:54 2003
@@ -208,6 +208,8 @@
 		K(i.totalswap),
 		K(i.freeswap));
 
+	len += hugetlb_report_meminfo(page + len);
+
 	return proc_calc_metrics(page, start, off, count, eof, len);
 #undef B
 #undef K
diff -Nru a/include/asm-ia64/mmu_context.h b/include/asm-ia64/mmu_context.h
--- a/include/asm-ia64/mmu_context.h	Wed Oct  8 09:09:54 2003
+++ b/include/asm-ia64/mmu_context.h	Wed Oct  8 09:09:54 2003
@@ -99,6 +99,10 @@
 	rr2 = rr0 + 2*rid_incr;
 	rr3 = rr0 + 3*rid_incr;
 	rr4 = rr0 + 4*rid_incr;
+#ifdef CONFIG_HUGETLB_PAGE
+	rr4 = (rr4 & (~(0xfcUL))) | (HPAGE_SHIFT << 2);
+#endif
+
 	ia64_set_rr(0x0000000000000000, rr0);
 	ia64_set_rr(0x2000000000000000, rr1);
 	ia64_set_rr(0x4000000000000000, rr2);
diff -Nru a/include/asm-ia64/page.h b/include/asm-ia64/page.h
--- a/include/asm-ia64/page.h	Wed Oct  8 09:09:54 2003
+++ b/include/asm-ia64/page.h	Wed Oct  8 09:09:54 2003
@@ -30,6 +30,34 @@
 #define PAGE_MASK		(~(PAGE_SIZE - 1))
 #define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)
 
+#ifdef CONFIG_HUGETLB_PAGE
+#if defined(CONFIG_HUGETLB_PAGE_SIZE_4GB)
+#define HPAGE_SHIFT	32
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_1GB)
+#define HPAGE_SHIFT	30
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_256MB)
+#define HPAGE_SHIFT	28
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64MB)
+#define HPAGE_SHIFT	26
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_16MB)
+#define HPAGE_SHIFT	24
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_4MB)
+#define HPAGE_SHIFT	22
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_1MB)
+#define HPAGE_SHIFT	20
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_256KB)
+#define HPAGE_SHIFT	18
+#else
+# error Unsupported IA-64 HugeTLB Page Size!
+#endif
+
+#define REGION_HPAGE	(4UL)
+#define REGION_SHIFT	61
+#define HPAGE_SIZE	(__IA64_UL_CONST(1) << HPAGE_SHIFT)
+#define HPAGE_MASK	(~(HPAGE_SIZE - 1))
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#endif
+
 #ifdef __ASSEMBLY__
 # define __pa(x)		((x) - PAGE_OFFSET)
 # define __va(x)		((x) + PAGE_OFFSET)
@@ -93,6 +121,14 @@
 #define REGION_SIZE		REGION_NUMBER(1)
 #define REGION_KERNEL		7
 
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define htlbpage_to_page(x)	((REGION_NUMBER(x) << 61) | (REGION_OFFSET(x) >> (HPAGE_SHIFT-PAGE_SHIFT)))
+#define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
+extern int is_invalid_hugepage_range(unsigned long addr, unsigned long len);
+#else
+#define is_invalid_hugepage_range(addr, len) 0
+#endif
 
 #define BUG() do { printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); *(int *)0=0; } while (0)
 #define PAGE_BUG(page) do { BUG(); } while (0)
diff -Nru a/include/linux/hugetlb.h b/include/linux/hugetlb.h
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/include/linux/hugetlb.h	Wed Oct  8 09:09:54 2003
@@ -0,0 +1,72 @@
+#ifndef _LINUX_HUGETLB_H
+#define _LINUX_HUGETLB_H
+
+#ifdef CONFIG_HUGETLB_PAGE
+
+#define COLOR_HALIGN(addr)	((addr + HPAGE_SIZE - 1) & ~(HPAGE_SIZE - 1))
+struct ctl_table;
+
+static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_HUGETLB;
+}
+
+static inline int is_hugepage_addr(unsigned long addr)
+{
+	return (rgn_index(addr) == REGION_HPAGE);
+}
+
+int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void *, size_t *);
+int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
+int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
+void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
+void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
+int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
+void huge_page_release(struct page *);
+int hugetlb_report_meminfo(char *);
+int is_hugepage_mem_enough(size_t);
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len);
+
+extern int htlbpage_max;
+
+#else /* !CONFIG_HUGETLB_PAGE */
+
+static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+	return 0;
+}
+
+#define follow_hugetlb_page(m,v,p,vs,a,b,i)	({ BUG(); 0; })
+#define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
+#define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
+#define zap_hugepage_range(vma, start, len)	BUG()
+#define unmap_hugepage_range(vma, start, end)	BUG()
+#define huge_page_release(page)			BUG()
+#define hugetlb_report_meminfo(buf)		0
+#define is_hugepage_mem_enough(size)		0
+#define is_hugepage_addr(addr)			0
+#define is_aligned_hugepage_range(addr, len)	0
+
+#endif /* !CONFIG_HUGETLB_PAGE */
+
+#ifdef CONFIG_HUGETLBFS
+extern struct file_operations hugetlbfs_file_operations;
+extern struct vm_operations_struct hugetlb_vm_ops;
+struct file *hugetlb_zero_setup(size_t);
+
+static inline int is_file_hugepages(struct file *file)
+{
+	return file->f_op == &hugetlbfs_file_operations;
+}
+
+static inline void set_file_hugepages(struct file *file)
+{
+	file->f_op = &hugetlbfs_file_operations;
+}
+
+#else /* !CONFIG_HUGETLBFS */
+
+#define is_file_hugepages(file)		0
+#define set_file_hugepages(file)	BUG()
+#define hugetlb_zero_setup(size)	ERR_PTR(-ENOSYS)
+
+#endif /* !CONFIG_HUGETLBFS */
+
+#endif /* _LINUX_HUGETLB_H */
diff -Nru a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h	Wed Oct  8 09:09:54 2003
+++ b/include/linux/mm.h	Wed Oct  8 09:09:54 2003
@@ -105,6 +105,7 @@
 #define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
 #define VM_WRITECOMBINED 0x00100000	/* Write-combined */
 #define VM_NONCACHED	0x00200000	/* Noncached access */
+#define VM_HUGETLB	0x00400000	/* Huge TLB page */
 
 #define VM_STACK_FLAGS	(VM_DATA_DEFAULT_FLAGS | VM_GROWSDOWN)
diff -Nru a/include/linux/shm.h b/include/linux/shm.h
--- a/include/linux/shm.h	Wed Oct  8 09:09:54 2003
+++ b/include/linux/shm.h	Wed Oct  8 09:09:54 2003
@@ -75,6 +75,7 @@
 /* shm_mode upper byte flags */
 #define SHM_DEST	01000	/* segment will be destroyed on last detach */
 #define SHM_LOCKED	02000	/* segment will not be swapped */
+#define SHM_HUGETLB	04000	/* segment will use HugeTLB pages */
 
 asmlinkage long sys_shmget (key_t key, size_t size, int flag);
 asmlinkage long sys_shmat (int shmid, char *shmaddr, int shmflg, unsigned long *addr);
diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h	Wed Oct  8 09:09:54 2003
+++ b/include/linux/sysctl.h	Wed Oct  8 09:09:54 2003
@@ -144,6 +144,7 @@
 	VM_MAX_MAP_COUNT=11,	/* int: Maximum number of active map areas */
 	VM_MIN_READAHEAD=12,	/* Min file readahead */
 	VM_MAX_READAHEAD=13,	/* Max file readahead */
+	VM_HUGETLB_PAGES=14	/* Number of available huge pages */
 };
diff -Nru a/ipc/shm.c b/ipc/shm.c
--- a/ipc/shm.c	Wed Oct  8 09:09:54 2003
+++ b/ipc/shm.c	Wed Oct  8 09:09:54 2003
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -125,6 +126,7 @@
 	shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	shm_rmid (shp->id);
 	shm_unlock(shp->id);
+	if (!is_file_hugepages(shp->shm_file))
 	shmem_lock(shp->shm_file, 0);
 	fput (shp->shm_file);
 	kfree (shp);
@@ -193,8 +195,13 @@
 	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_USER);
 	if (!shp)
 		return -ENOMEM;
+	if (shmflg & SHM_HUGETLB)
+		file = hugetlb_zero_setup(size);
+	else
+	{
 	sprintf (name, "SYSV%08x", key);
 	file = shmem_file_setup(name, size);
+	}
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto no_file;
@@ -214,6 +221,9 @@
 	shp->id = shm_buildid(id,shp->shm_perm.seq);
 	shp->shm_file = file;
 	file->f_dentry->d_inode->i_ino = shp->id;
+	if (shmflg & SHM_HUGETLB)
+		set_file_hugepages(file);
+	else
 	file->f_op = &shm_file_operations;
 	shm_tot += numpages;
 	shm_unlock (id);
@@ -452,7 +462,10 @@
 		tbuf.shm_ctime	= shp->shm_ctim;
 		tbuf.shm_cpid	= shp->shm_cprid;
 		tbuf.shm_lpid	= shp->shm_lprid;
+		if (!is_file_hugepages(shp->shm_file))
 		tbuf.shm_nattch	= shp->shm_nattch;
+		else
+			tbuf.shm_nattch = file_count(shp->shm_file) - 1;
 		shm_unlock(shmid);
 		if(copy_shmid_to_user (buf, &tbuf, version))
 			return -EFAULT;
@@ -474,9 +487,11 @@
 		if(err)
 			goto out_unlock;
 		if(cmd==SHM_LOCK) {
+			if (!is_file_hugepages(shp->shm_file))
 			shmem_lock(shp->shm_file, 1);
 			shp->shm_flags |= SHM_LOCKED;
 		} else {
+			if (!is_file_hugepages(shp->shm_file))
 			shmem_lock(shp->shm_file, 0);
 			shp->shm_flags &= ~SHM_LOCKED;
 		}
@@ -678,7 +693,7 @@
 	down_write(&mm->mmap_sem);
 	for (shmd = mm->mmap; shmd; shmd = shmdnext) {
 		shmdnext = shmd->vm_next;
-		if (shmd->vm_ops == &shm_vm_ops
+		if (((shmd->vm_ops == &shm_vm_ops) || is_vm_hugetlb_page(shmd))
 		    && shmd->vm_start - (shmd->vm_pgoff << PAGE_SHIFT) == (ulong) shmaddr) {
 			do_munmap(mm, shmd->vm_start, shmd->vm_end - shmd->vm_start);
 			retval = 0;
@@ -718,7 +733,7 @@
 			shp->shm_segsz,
 			shp->shm_cprid,
 			shp->shm_lprid,
-			shp->shm_nattch,
+			is_file_hugepages(shp->shm_file) ? (file_count(shp->shm_file) - 1) : shp->shm_nattch,
 			shp->shm_perm.uid,
 			shp->shm_perm.gid,
 			shp->shm_perm.cuid,
diff -Nru a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c	Wed Oct  8 09:09:54 2003
+++ b/kernel/sysctl.c	Wed Oct  8 09:09:54 2003
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 
 #include
@@ -280,6 +281,10 @@
 	 &vm_max_readahead,sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_MAX_MAP_COUNT, "max_map_count",
 	 &max_map_count, sizeof(int), 0644, NULL, &proc_dointvec},
+#ifdef CONFIG_HUGETLB_PAGE
+	{VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, NULL,
+	 &hugetlb_sysctl_handler},
+#endif
 	{0}
 };
diff -Nru a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c	Wed Oct  8 09:09:54 2003
+++ b/mm/memory.c	Wed Oct  8 09:09:54 2003
@@ -37,6 +37,7 @@
  */
 
 #include
+#include
 #include
 #include
 #include
@@ -181,6 +182,9 @@
 	unsigned long end = vma->vm_end;
 	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
+	if (is_vm_hugetlb_page(vma)) {
+		return copy_hugetlb_page_range(dst, src, vma);
+	}
 	src_pgd = pgd_offset(src, address)-1;
 	dst_pgd = pgd_offset(dst, address)-1;
@@ -473,6 +477,10 @@
 		if ( !vma || (pages && vma->vm_flags & VM_IO) || !(flags & vma->vm_flags) )
 			return i ? : -EFAULT;
 
+		if (is_vm_hugetlb_page(vma)) {
+			i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &len, i);
+			continue;
+		}
 		spin_lock(&mm->page_table_lock);
 		do {
 			struct page *map;
@@ -1370,6 +1378,9 @@
 	current->state = TASK_RUNNING;
 	pgd = pgd_offset(mm, address);
+
+	if (is_vm_hugetlb_page(vma))
+		return 0;	/* mapping truncation does this. */
 
 	/*
 	 * We need the page table lock to synchronize with kswapd
diff -Nru a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c	Wed Oct  8 09:09:54 2003
+++ b/mm/mmap.c	Wed Oct  8 09:09:54 2003
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -594,6 +595,9 @@
 	fput(file);
 
 	/* Undo any partial mapping done by a device driver. */
+	if (is_vm_hugetlb_page(vma))
+		zap_hugepage_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+	else
 	zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
 free_vma:
 	kmem_cache_free(vm_area_cachep, vma);
@@ -644,10 +648,26 @@
 unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
 	if (flags & MAP_FIXED) {
+		unsigned long ret;
+
 		if (addr > TASK_SIZE - len)
 			return -ENOMEM;
 		if (addr & ~PAGE_MASK)
 			return -EINVAL;
+		if (file && is_file_hugepages(file))
+			/* If the request is for hugepages, then make sure
+			 * that addr and length are properly aligned.
+			 */
+			ret = is_aligned_hugepage_range(addr, len);
+		else
+			/*
+			 * Make sure that a normal request is not falling
+			 * into the reserved hugepage range.  Some archs,
+			 * such as IA-64, have a separate region for
+			 * hugepages.
+			 */
+			ret = is_invalid_hugepage_range(addr, len);
+		if (ret)
+			return ret;
 		return addr;
 	}
@@ -941,6 +961,12 @@
 		return 0;
 
 	/* we have  addr < mpnt->vm_end  */
+	if (is_vm_hugetlb_page(mpnt)) {
+		int ret = is_aligned_hugepage_range(addr, len);
+
+		if (ret)
+			return ret;
+	}
 	if (mpnt->vm_start >= addr+len)
 		return 0;
@@ -994,6 +1020,9 @@
 		remove_shared_vm_struct(mpnt);
 		mm->map_count--;
 
+		if (is_vm_hugetlb_page(mpnt))
+			zap_hugepage_range(mpnt, st, size);
+		else
 		zap_page_range(mm, st, size);
 
 		/*
@@ -1151,6 +1180,9 @@
 	}
 	mm->map_count--;
 	remove_shared_vm_struct(mpnt);
+	if (is_vm_hugetlb_page(mpnt))
+		zap_hugepage_range(mpnt, start, size);
+	else
 	zap_page_range(mm, start, size);
 	if (mpnt->vm_file)
 		fput(mpnt->vm_file);
diff -Nru a/mm/mprotect.c b/mm/mprotect.c
--- a/mm/mprotect.c	Wed Oct  8 09:09:54 2003
+++ b/mm/mprotect.c	Wed Oct  8 09:09:54 2003
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -294,6 +295,10 @@
 
 	/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 
+	if (is_vm_hugetlb_page(vma)) {
+		error = -EACCES;
+		goto out;
+	}
 	newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC));
 	if ((newflags & ~(newflags >> 4)) & 0xf) {
 		error = -EACCES;
diff -Nru a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c	Wed Oct  8 09:09:54 2003
+++ b/mm/mremap.c	Wed Oct  8 09:09:54 2003
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -267,6 +268,10 @@
 	vma = find_vma(current->mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
+	if (is_vm_hugetlb_page(vma)) {
+		ret = -EINVAL;
+		goto out;
+	}
 	/* We can't remap across vm area boundaries */
 	if (old_len > vma->vm_end - addr)
 		goto out;
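
For testing, a minimal user-space sketch of the SHM_HUGETLB interface added
above (not part of the patch; it assumes the default 16MB huge page size, so
adjust HPAGE_SIZE to match the configured CONFIG_HUGETLB_PAGE_SIZE_* option):

	#include <stdio.h>
	#include <string.h>
	#include <sys/ipc.h>
	#include <sys/shm.h>

	#define HPAGE_SIZE	(16UL * 1024 * 1024)	/* must match kernel config */
	#ifndef SHM_HUGETLB
	#define SHM_HUGETLB	04000			/* from the linux/shm.h hunk above */
	#endif

	int main(void)
	{
		/* One huge page as a SysV shm segment backed by hugetlbfs. */
		int id = shmget(IPC_PRIVATE, HPAGE_SIZE,
				SHM_HUGETLB | IPC_CREAT | 0600);
		char *p;

		if (id < 0) {
			perror("shmget");	/* e.g. hugepage pool is empty */
			return 1;
		}
		p = shmat(id, NULL, 0);		/* lands in the hugepage region on ia64 */
		if (p == (char *) -1) {
			perror("shmat");
			return 1;
		}
		memset(p, 0, HPAGE_SIZE);	/* pages were already prefaulted at attach */
		shmdt(p);
		shmctl(id, IPC_RMID, NULL);	/* segment is destroyed on last detach */
		return 0;
	}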